# -*-Python-*-
# Bitransformer configuration using the same hyperparameters as bert_large.

# NOTE(review): this reads as a gin-config file. Dotted lines bind a parameter
# of a configurable (`scope.fn.param = value`); bare `name = value` lines
# define macros. The code that consumes these macros is not visible in this
# file, so the per-name descriptions below are inferred from the names --
# confirm against the consumer before relying on them.

# Bind the `model_type` parameter of `utils.run` to "bitransformer".
utils.run.model_type = "bitransformer"
# Presumably the model (hidden) width.
d_model = 1024
# Presumably the number of layers -- TODO confirm whether this is per stack
# or in total for a bitransformer.
num_layers = 24
# Presumably the feed-forward inner width; equals 4 * d_model.
d_ff = 4096
# Presumably the number of attention heads.
num_heads = 16
# Presumably the per-head key/value width; num_heads * d_kv = 16 * 64 = 1024,
# which equals d_model.
d_kv = 64
# Bind the `model_parallelism` parameter of `utils.tpu_mesh_shape` to 2.
# NOTE(review): presumably the number of cores the model is split across --
# confirm against utils.tpu_mesh_shape.
utils.tpu_mesh_shape.model_parallelism = 2
